{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "# Load environment variables from openai.env file\n",
    "load_dotenv(\"openai.env\")\n",
    "\n",
    "# Read the OPENAI_API_KEY from the environment\n",
    "api_key = os.getenv(\"OPENAI_API_KEY\")\n",
    "api_base = os.getenv(\"OPENAI_API_BASE\")\n",
    "os.environ[\"OPENAI_API_KEY\"] = api_key\n",
    "os.environ[\"OPENAI_API_BASE\"] = api_base"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ChatDoc:又一个智能文档助手\n",
    "\n",
    "- 读取pdf、excel、doc三种常见的文档格式\n",
    "- 根据文档内容，智能抽取内容并输出相应格式\n",
    "<hr>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. 安装相关的包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#安装必须的包\n",
    "! pip install docx2txt\n",
    "! pip install pypdf\n",
    "! pip install nltk"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 第一个测试，加载docx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='一、公司基本信息\\n\\n名称：宏图科技发展有限公司\\n\\n注册地址：江苏省南京市雨花台区软件大道101号\\n\\n成立日期：2011年5月16日\\n\\n法定代表人：李强\\n\\n注册资本：人民币5000万元\\n\\n员工人数：约200人\\n\\n联系电话：025-88888888\\n\\n电子邮箱：info@hongtutech.cn\\n\\n\\n\\n二、财务状况概述\\n\\n截至2023年第一季度，宏图科技发展有限公司财务状况堪忧，具体情况如下：\\n\\n1. 资产总额：人民币1.2亿元，较上年同期下降30%。\\n\\n2. 负债总额：人民币1.8亿元，较上年同期上升50%，资不抵债。\\n\\n3. 营业收入：人民币3000万元，较上年同期下降60%。\\n\\n4. 净利润：亏损人民币800万元，去年同期为盈利人民币200万元。\\n\\n5. 现金流量：公司现金流量紧张，现金及现金等价物余额为人民币500万元，难以支撑日常运营。\\n\\n6. 存货：存货积压严重，库存商品价值约为人民币400万元，大部分产品滞销。\\n\\n7. 应收账款：应收账款高达人民币600万元，回收难度大，坏账准备不足。\\n\\n\\n\\n三、主营业务及市场状况\\n\\n宏图科技发展有限公司主要从事计算机软件的研发与销售。近年来，由于市场竞争加剧、技术更新换代速度快和管理层决策失误等原因，公司主营业务收入持续下降。目前，公司面临的主要问题有：\\n\\n1. 产品同质化严重，缺乏核心竞争力。\\n\\n2. 新产品开发进度缓慢，未能及时抓住市场需求变化。\\n\\n3. 市场营销策略不当，导致市场份额大幅缩水。\\n\\n4. 行业内新兴企业崛起迅速，原有客户流失严重。\\n\\n\\n\\n四、债权债务情况\\n\\n宏图科技发展有限公司目前面临的债务问题严峻，具体情况如下：\\n\\n1. 银行贷款：公司向多家银行贷款总额达人民币1亿元，部分贷款已逾期未还。\\n\\n2. 供应商欠款：因现金流紧张，公司拖欠供应商货款达人民币300万元。\\n\\n3. 员工工资及社保：由于资金链断裂，公司拖欠员工工资及社保费用共计人民币200万元。\\n\\n4. 其他应付款项：包括税费、租赁费用等其他应付款项累计约人民币100万元。\\n\\n\\n\\n五、资产清单\\n\\n宏图科技发展有限公司目前拥有的主要资产包括：\\n\\n1. 固定资产：公司办公用房和设备原值合计人民币800万元，累计折旧约400万元。\\n\\n2. 无形资产：包括软件著作权、专利权等无形资产原值合计人民币300万元。\\n\\n3. 存货资产：存货包括已完成软件产品和半成品，价值约为人民币400万元。\\n\\n4. 应收账款：主要包括对外销售软件的应收账款合计人民币600万元。\\n\\n\\n\\n六、潜在风险及预警\\n\\n1. 经营风险：由于连续亏损，公司可能面临破产清算的风险。\\n\\n2. 债务风险：负债累累，若短期内无法筹措足够资金偿还债务，可能面临诉讼或资产被查封的风险。\\n\\n3. 市场风险：行业竞争加剧和市场需求不明朗，可能导致公司未来业绩继续恶化。\\n\\n4. 法律风险：因未能按时支付债务和相关费用，可能面临相关法律诉讼或处罚。\\n\\n\\n\\n七、结论与建议\\n\\n综上所述，宏图科技发展有限公司目前处于财务困境之中，若无外部资金注入或业务转型成功，短期内难以扭转局势。对于不良资产收购方来说，在考虑收购宏图科技的相关资产前，建议进行深入的尽职调查，并制定详细的风险控制和资产处置方案。同时，在估值时应充分考虑到公司所面临的各种潜在风险和清收难度。\\n\\n\\n\\n报告撰写日期：2023年4月20日', metadata={'source': 'example/fake.docx'})]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#倒入必须的包\n",
    "from langchain.document_loaders import Docx2txtLoader\n",
    "\n",
    "#定义chatdoc\n",
    "class ChatDoc():\n",
    "    def getFile():\n",
    "        #读取文件\n",
    "        loader = Docx2txtLoader(\"example/fake.docx\")\n",
    "        text = loader.load()\n",
    "        return text;\n",
    "\n",
    "ChatDoc.getFile()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 第二个测试，加载pdf文档"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content=' 一、公司基本信息 名称：宏图科技发展有限公司 注册地址：江苏省南京市雨花台区软件大道101号 成立日期：2011年5月16日 法定代表人：李强 注册资本：人民币5000万元 员工人数：约200人 联系电话：025-88888888 电子邮箱：info@hongtutech.cn  二、财务状况概述 截至2023年第一季度，宏图科技发展有限公司财务状况堪忧，具体情况如下： 1. 资产总额：人民币1.2亿元，较上年同期下降30%。 2. 负债总额：人民币1.8亿元，较上年同期上升50%，资不抵债。 3. 营业收入：人民币3000万元，较上年同期下降60%。 4. 净利润：亏损人民币800万元，去年同期为盈利人民币200万元。 5. 现金流量：公司现金流量紧张，现金及现金等价物余额为人民币500万元，难以支撑日常运营。 6. 存货： 存货积压严重， 库存商品价值约为人民币400万元， 大部分产品滞销。 7. 应收账款：应收账款高达人民币600万元，回收难度大，坏账准备不足。  三、主营业务及市场状况 宏图科技发展有限公司主要从事计算机软件的研发与销售。近年来，由于市场竞争加剧、技术更新换代速度快和管理层决策失误等原因，公司主营业务收入持续下降。目前，公司面临的主要问题有： 1. 产品同质化严重，缺乏核心竞争力。 2. 新产品开发进度缓慢，未能及时抓住市场需求变化。 3. 市场营销策略不当，导致市场份额大幅缩水。 ', metadata={'source': 'example/fake.pdf', 'page': 0}),\n",
       " Document(page_content='4. 行业内新兴企业崛起迅速，原有客户流失严重。  四、债权债务情况 宏图科技发展有限公司目前面临的债务问题严峻，具体情况如下： 1. 银行贷款： 公司向多家银行贷款总额达人民币1亿元， 部分贷款已逾期未还。 2. 供应商欠款：因现金流紧张，公司拖欠供应商货款达人民币300万元。 3. 员工工资及社保：由于资金链断裂，公司拖欠员工工资及社保费用共计人民币200万元。 4. 其他应付款项： 包括税费、 租赁费用等其他应付款项累计约人民币100万元。  五、资产清单 宏图科技发展有限公司目前拥有的主要资产包括： 1. 固定资产：公司办公用房和设备原值合计人民币800万元，累计折旧约400万元。 2. 无形资产：包括软件著作权、专利权等无形资产原值合计人民币300万元。 3. 存货资产：存货包括已完成软件产品和半成品，价值约为人民币400万元。 4. 应收账款：主要包括对外销售软件的应收账款合计人民币600万元。  六、潜在风险及预警 1. 经营风险：由于连续亏损，公司可能面临破产清算的风险。 2. 债务风险：负债累累，若短期内无法筹措足够资金偿还债务，可能面临诉讼或资产被查封的风险。 3. 市场风险：行业竞争加剧和市场需求不明朗，可能导致公司未来业绩继续恶化。 4. 法律风险： 因未能按时支付债务和相关费用， 可能面临相关法律诉讼或处罚。  七、结论与建议 综上所述，宏图科技发展有限公司目前处于财务困境之中，若无外部资金注入或业务转型成功，短期内难以扭转局势。对于不良资产收购方来说，在考虑收购宏', metadata={'source': 'example/fake.pdf', 'page': 1}),\n",
       " Document(page_content='图科技的相关资产前，建议进行深入的尽职调查，并制定详细的风险控制和资产处置方案。 同时， 在估值时应充分考虑到公司所面临的各种潜在风险和清收难度。  报告撰写日期：2023年4月20日 ', metadata={'source': 'example/fake.pdf', 'page': 2})]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#导入必须的包\n",
    "from langchain.document_loaders import PyPDFLoader\n",
    "\n",
    "#定义chatdoc\n",
    "class ChatDoc():\n",
    "    def getFile():\n",
    "        try:\n",
    "            #读取文件\n",
    "            loader = PyPDFLoader(\"example/fake.pdf\")\n",
    "            text = loader.load()\n",
    "            return text;\n",
    "        except Exception as e:\n",
    "            print(f\"Error loading files:{e}\")\n",
    "ChatDoc.getFile()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.第三个测试，加载下excel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='\\n\\n\\n名称\\n宏图科技发展有限公司\\n\\n\\n注册地址\\n江苏省南京市雨花台区软件大道101号\\n\\n\\n成立日期\\n40679\\n\\n\\n法定代表人\\n李强\\n\\n\\n注册资本\\n人民币5000万元\\n\\n\\n员工人数\\n约200人\\n\\n\\n联系电话\\n025-88888888\\n\\n\\n电子邮箱\\ninfo@hongtutech.cn\\n\\n\\n资产总额\\n人民币1.2亿元，较上年同期下降30%\\n\\n\\n负债总额\\n人民币1.8亿元，较上年同期上升50%，资不抵债\\n\\n\\n营业收入\\n人民币3000万元，较上年同期下降60%\\n\\n\\n净利润\\n亏损人民币800万元，去年同期为盈利人民币200万元\\n\\n\\n现金流量\\n公司现金流量紧张，现金及现金等价物余额为人民币500万元，难以支撑日常运营\\n\\n\\n', metadata={'source': 'example/fake.xlsx', 'file_directory': 'example', 'filename': 'fake.xlsx', 'last_modified': '2023-12-05T16:45:13', 'page_name': 'Sheet1', 'page_number': 1, 'text_as_html': '<table border=\"1\" class=\"dataframe\">\\n  <tbody>\\n    <tr>\\n      <td>名称</td>\\n      <td>宏图科技发展有限公司</td>\\n    </tr>\\n    <tr>\\n      <td>注册地址</td>\\n      <td>江苏省南京市雨花台区软件大道101号</td>\\n    </tr>\\n    <tr>\\n      <td>成立日期</td>\\n      <td>40679</td>\\n    </tr>\\n    <tr>\\n      <td>法定代表人</td>\\n      <td>李强</td>\\n    </tr>\\n    <tr>\\n      <td>注册资本</td>\\n      <td>人民币5000万元</td>\\n    </tr>\\n    <tr>\\n      <td>员工人数</td>\\n      <td>约200人</td>\\n    </tr>\\n    <tr>\\n      <td>联系电话</td>\\n      <td>025-88888888</td>\\n    </tr>\\n    <tr>\\n      <td>电子邮箱</td>\\n      <td>info@hongtutech.cn</td>\\n    </tr>\\n    <tr>\\n      <td>资产总额</td>\\n      <td>人民币1.2亿元，较上年同期下降30%</td>\\n    </tr>\\n    <tr>\\n      <td>负债总额</td>\\n      <td>人民币1.8亿元，较上年同期上升50%，资不抵债</td>\\n    </tr>\\n    <tr>\\n      <td>营业收入</td>\\n      <td>人民币3000万元，较上年同期下降60%</td>\\n    </tr>\\n    <tr>\\n      <td>净利润</td>\\n      <td>亏损人民币800万元，去年同期为盈利人民币200万元</td>\\n    </tr>\\n    <tr>\\n      <td>现金流量</td>\\n      <td>公司现金流量紧张，现金及现金等价物余额为人民币500万元，难以支撑日常运营</td>\\n    </tr>\\n  </tbody>\\n</table>', 'languages': ['zho', 'kor'], 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'category': 'Table'})]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#导入必须的包\n",
    "from langchain.document_loaders import UnstructuredExcelLoader\n",
    "\n",
    "#定义chatdoc\n",
    "class ChatDoc():\n",
    "    def getFile():\n",
    "        try:\n",
    "            #读取文件\n",
    "            loader = UnstructuredExcelLoader(\"example/fake.xlsx\",mode=\"elements\")\n",
    "            text = loader.load()\n",
    "            return text;\n",
    "        except Exception as e:\n",
    "            print(f\"Error loading files:{e}\")\n",
    "ChatDoc.getFile()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5. 整合优化，动态加载三种文件格式,增加了文本切割"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Document(page_content='名称\\n宏图科技发展有限公司\\n\\n\\n注册地址\\n江苏省南京市雨花台区软件大道101号\\n\\n\\n成立日期\\n40679\\n\\n\\n法定代表人\\n李强\\n\\n\\n注册资本\\n人民币5000万元\\n\\n\\n员工人数\\n约200人\\n\\n\\n联系电话\\n025-88888888\\n\\n\\n电子邮箱\\ninfo@hongtutech.cn', metadata={'source': 'example/fake.xlsx'}), Document(page_content='资产总额\\n人民币1.2亿元，较上年同期下降30%\\n\\n\\n负债总额\\n人民币1.8亿元，较上年同期上升50%，资不抵债\\n\\n\\n营业收入\\n人民币3000万元，较上年同期下降60%\\n\\n\\n净利润\\n亏损人民币800万元，去年同期为盈利人民币200万元', metadata={'source': 'example/fake.xlsx'}), Document(page_content='现金流量\\n公司现金流量紧张，现金及现金等价物余额为人民币500万元，难以支撑日常运营', metadata={'source': 'example/fake.xlsx'})]\n"
     ]
    }
   ],
   "source": [
    "#导入必须的包\n",
    "from langchain.document_loaders import UnstructuredExcelLoader,Docx2txtLoader,PyPDFLoader\n",
    "from langchain.text_splitter import  CharacterTextSplitter\n",
    "\n",
    "\n",
    "#定义chatdoc\n",
    "class ChatDoc():\n",
    "    def __init__(self):\n",
    "        self.doc = None\n",
    "        self.splitText = [] #分割后的文本\n",
    "\n",
    "    def getFile(self):\n",
    "        doc = self.doc\n",
    "        loaders = {\n",
    "            \"docx\":Docx2txtLoader,\n",
    "            \"pdf\":PyPDFLoader,\n",
    "            \"xlsx\":UnstructuredExcelLoader,\n",
    "        }\n",
    "        file_extension = doc.split(\".\")[-1]\n",
    "        loader_class = loaders.get(file_extension)\n",
    "        if loader_class:\n",
    "            try:\n",
    "                loader = loader_class(doc)\n",
    "                text = loader.load()\n",
    "                return text\n",
    "            except Exception as e: \n",
    "                print(f\"Error loading {file_extension} files:{e}\") \n",
    "        else:\n",
    "             print(f\"Unsupported file extension: {file_extension}\")\n",
    "             return  None \n",
    "\n",
    "    #处理文档的函数\n",
    "    def splitSentences(self):\n",
    "        full_text = self.getFile() #获取文档内容\n",
    "        if full_text != None:\n",
    "            #对文档进行分割\n",
    "            text_split = CharacterTextSplitter(\n",
    "                chunk_size=150,\n",
    "                chunk_overlap=20,\n",
    "            )\n",
    "            texts = text_split.split_documents(full_text)\n",
    "            self.splitText = texts\n",
    "\n",
    "chat_doc = ChatDoc()\n",
    "chat_doc.doc = \"example/fake.xlsx\"\n",
    "chat_doc.splitSentences()\n",
    "print(chat_doc.splitText)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.向量化与存储索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<langchain_community.vectorstores.chroma.Chroma at 0x28f8c2c50>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#导入必须的包\n",
    "from langchain.document_loaders import UnstructuredExcelLoader,Docx2txtLoader,PyPDFLoader\n",
    "from langchain.text_splitter import  CharacterTextSplitter\n",
    "from langchain_openai import OpenAIEmbeddings\n",
    "from langchain.vectorstores import  Chroma\n",
    "\n",
    "\n",
    "#定义chatdoc\n",
    "class ChatDoc():\n",
    "    def __init__(self):\n",
    "        self.doc = None\n",
    "        self.splitText = [] #分割后的文本\n",
    "\n",
    "    def getFile(self):\n",
    "        doc = self.doc\n",
    "        loaders = {\n",
    "            \"docx\":Docx2txtLoader,\n",
    "            \"pdf\":PyPDFLoader,\n",
    "            \"xlsx\":UnstructuredExcelLoader,\n",
    "        }\n",
    "        file_extension = doc.split(\".\")[-1]\n",
    "        loader_class = loaders.get(file_extension)\n",
    "        if loader_class:\n",
    "            try:\n",
    "                loader = loader_class(doc)\n",
    "                text = loader.load()\n",
    "                return text\n",
    "            except Exception as e: \n",
    "                print(f\"Error loading {file_extension} files:{e}\") \n",
    "        else:\n",
    "             print(f\"Unsupported file extension: {file_extension}\")\n",
    "             return  None \n",
    "\n",
    "    #处理文档的函数\n",
    "    def splitSentences(self):\n",
    "        full_text = self.getFile() #获取文档内容\n",
    "        if full_text != None:\n",
    "            #对文档进行分割\n",
    "            text_split = CharacterTextSplitter(\n",
    "                chunk_size=150,\n",
    "                chunk_overlap=20,\n",
    "            )\n",
    "            texts = text_split.split_documents(full_text)\n",
    "            self.splitText = texts\n",
    "    \n",
    "    #向量化与向量存储\n",
    "    def embeddingAndVectorDB(self):\n",
    "        embeddings = OpenAIEmbeddings()\n",
    "        db =Chroma.from_documents(\n",
    "            documents = self.splitText,\n",
    "            embedding = embeddings,\n",
    "        )\n",
    "        return db\n",
    "\n",
    "chat_doc = ChatDoc()\n",
    "chat_doc.doc = \"example/fake.docx\"\n",
    "chat_doc.splitSentences()\n",
    "chat_doc.embeddingAndVectorDB()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 7. 索引并使用自然语言找出相关的文本块"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='一、公司基本信息\\n\\n名称：宏图科技发展有限公司\\n\\n注册地址：江苏省南京市雨花台区软件大道101号\\n\\n成立日期：2011年5月16日\\n\\n法定代表人：李强\\n\\n注册资本：人民币5000万元\\n\\n员工人数：约200人\\n\\n联系电话：025-88888888\\n\\n电子邮箱：info@hongtutech.cn', metadata={'source': 'example/fake.docx'}),\n",
       " Document(page_content='7. 应收账款：应收账款高达人民币600万元，回收难度大，坏账准备不足。\\n\\n三、主营业务及市场状况\\n\\n宏图科技发展有限公司主要从事计算机软件的研发与销售。近年来，由于市场竞争加剧、技术更新换代速度快和管理层决策失误等原因，公司主营业务收入持续下降。目前，公司面临的主要问题有：', metadata={'source': 'example/fake.docx'}),\n",
       " Document(page_content='4. 其他应付款项：包括税费、租赁费用等其他应付款项累计约人民币100万元。\\n\\n五、资产清单\\n\\n宏图科技发展有限公司目前拥有的主要资产包括：\\n\\n1. 固定资产：公司办公用房和设备原值合计人民币800万元，累计折旧约400万元。', metadata={'source': 'example/fake.docx'}),\n",
       " Document(page_content='1. 银行贷款：公司向多家银行贷款总额达人民币1亿元，部分贷款已逾期未还。\\n\\n2. 供应商欠款：因现金流紧张，公司拖欠供应商货款达人民币300万元。\\n\\n3. 员工工资及社保：由于资金链断裂，公司拖欠员工工资及社保费用共计人民币200万元。', metadata={'source': 'example/fake.docx'})]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#导入必须的包\n",
    "from langchain.document_loaders import UnstructuredExcelLoader,Docx2txtLoader,PyPDFLoader\n",
    "from langchain.text_splitter import  CharacterTextSplitter\n",
    "from langchain_openai import OpenAIEmbeddings\n",
    "from langchain.vectorstores import  Chroma\n",
    "\n",
    "\n",
    "#定义chatdoc\n",
    "class ChatDoc():\n",
    "    def __init__(self):\n",
    "        self.doc = None\n",
    "        self.splitText = [] #分割后的文本\n",
    "\n",
    "    def getFile(self):\n",
    "        doc = self.doc\n",
    "        loaders = {\n",
    "            \"docx\":Docx2txtLoader,\n",
    "            \"pdf\":PyPDFLoader,\n",
    "            \"xlsx\":UnstructuredExcelLoader,\n",
    "        }\n",
    "        file_extension = doc.split(\".\")[-1]\n",
    "        loader_class = loaders.get(file_extension)\n",
    "        if loader_class:\n",
    "            try:\n",
    "                loader = loader_class(doc)\n",
    "                text = loader.load()\n",
    "                return text\n",
    "            except Exception as e: \n",
    "                print(f\"Error loading {file_extension} files:{e}\") \n",
    "        else:\n",
    "             print(f\"Unsupported file extension: {file_extension}\")\n",
    "             return  None \n",
    "\n",
    "    #处理文档的函数\n",
    "    def splitSentences(self):\n",
    "        full_text = self.getFile() #获取文档内容\n",
    "        if full_text != None:\n",
    "            #对文档进行分割\n",
    "            text_split = CharacterTextSplitter(\n",
    "                chunk_size=150,\n",
    "                chunk_overlap=20,\n",
    "            )\n",
    "            texts = text_split.split_documents(full_text)\n",
    "            self.splitText = texts\n",
    "    \n",
    "    #向量化与向量存储\n",
    "    def embeddingAndVectorDB(self):\n",
    "        embeddings = OpenAIEmbeddings(\n",
    "             model=\"text-embedding-3-small\"\n",
    "        )\n",
    "        db =Chroma.from_documents(\n",
    "            documents = self.splitText,\n",
    "            embedding = embeddings,\n",
    "        )\n",
    "        return db\n",
    "    \n",
    "    #提问并找到相关的文本块\n",
    "    def askAndFindFiles(self,question):\n",
    "        db = self.embeddingAndVectorDB()\n",
    "        retriever = db.as_retriever()\n",
    "        results = retriever.invoke(question)\n",
    "        return results\n",
    "\n",
    "chat_doc = ChatDoc()\n",
    "chat_doc.doc = \"example/fake.docx\"\n",
    "chat_doc.splitSentences()\n",
    "chat_doc.askAndFindFiles(\"这家公司叫什么名字?\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 8. 使用多重查询提高文档检索精确度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:httpx:HTTP Request: POST https://ai-yyds.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
      "/Users/tomiezhang/.pyenv/versions/3.10.12/envs/langchains/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The class `langchain_community.chat_models.openai.ChatOpenAI` was deprecated in langchain-community 0.0.10 and will be removed in 0.2.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import ChatOpenAI`.\n",
      "  warn_deprecated(\n",
      "INFO:httpx:HTTP Request: POST https://ai-yyds.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
      "INFO:langchain.retrievers.multi_query:Generated queries: ['What is the name of the company?', 'What is the official title of the company?', 'Can you tell me the name of the organization?']\n",
      "INFO:httpx:HTTP Request: POST https://ai-yyds.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST https://ai-yyds.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST https://ai-yyds.com/v1/embeddings \"HTTP/1.1 200 OK\"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Document(page_content='一、公司基本信息\\n\\n名称：宏图科技发展有限公司\\n\\n注册地址：江苏省南京市雨花台区软件大道101号\\n\\n成立日期：2011年5月16日\\n\\n法定代表人：李强\\n\\n注册资本：人民币5000万元\\n\\n员工人数：约200人\\n\\n联系电话：025-88888888\\n\\n电子邮箱：info@hongtutech.cn', metadata={'source': 'example/fake.docx'}), Document(page_content='7. 应收账款：应收账款高达人民币600万元，回收难度大，坏账准备不足。\\n\\n三、主营业务及市场状况\\n\\n宏图科技发展有限公司主要从事计算机软件的研发与销售。近年来，由于市场竞争加剧、技术更新换代速度快和管理层决策失误等原因，公司主营业务收入持续下降。目前，公司面临的主要问题有：', metadata={'source': 'example/fake.docx'}), Document(page_content='4. 其他应付款项：包括税费、租赁费用等其他应付款项累计约人民币100万元。\\n\\n五、资产清单\\n\\n宏图科技发展有限公司目前拥有的主要资产包括：\\n\\n1. 固定资产：公司办公用房和设备原值合计人民币800万元，累计折旧约400万元。', metadata={'source': 'example/fake.docx'}), Document(page_content='报告撰写日期：2023年4月20日', metadata={'source': 'example/fake.docx'})]\n"
     ]
    }
   ],
   "source": [
    "#导入必须的包\n",
    "from langchain.document_loaders import UnstructuredExcelLoader,Docx2txtLoader,PyPDFLoader\n",
    "from langchain.text_splitter import  CharacterTextSplitter\n",
    "from langchain_openai import OpenAIEmbeddings\n",
    "from langchain.vectorstores import  Chroma\n",
    "#引入openai和多重向量检索\n",
    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.retrievers.multi_query import MultiQueryRetriever\n",
    "\n",
    "\n",
    "#定义chatdoc\n",
    "class ChatDoc():\n",
    "    def __init__(self):\n",
    "        self.doc = None\n",
    "        self.splitText = [] #分割后的文本\n",
    "\n",
    "    def getFile(self):\n",
    "        doc = self.doc\n",
    "        loaders = {\n",
    "            \"docx\":Docx2txtLoader,\n",
    "            \"pdf\":PyPDFLoader,\n",
    "            \"xlsx\":UnstructuredExcelLoader,\n",
    "        }\n",
    "        file_extension = doc.split(\".\")[-1]\n",
    "        loader_class = loaders.get(file_extension)\n",
    "        if loader_class:\n",
    "            try:\n",
    "                loader = loader_class(doc)\n",
    "                text = loader.load()\n",
    "                return text\n",
    "            except Exception as e: \n",
    "                print(f\"Error loading {file_extension} files:{e}\") \n",
    "        else:\n",
    "             print(f\"Unsupported file extension: {file_extension}\")\n",
    "             return  None \n",
    "\n",
    "    #处理文档的函数\n",
    "    def splitSentences(self):\n",
    "        full_text = self.getFile() #获取文档内容\n",
    "        if full_text != None:\n",
    "            #对文档进行分割\n",
    "            text_split = CharacterTextSplitter(\n",
    "                chunk_size=150,\n",
    "                chunk_overlap=20,\n",
    "            )\n",
    "            texts = text_split.split_documents(full_text)\n",
    "            self.splitText = texts\n",
    "    \n",
    "    #向量化与向量存储\n",
    "    def embeddingAndVectorDB(self):\n",
    "        embeddings = OpenAIEmbeddings(\n",
    "            model=\"text-embedding-3-small\"\n",
    "        )\n",
    "        db =Chroma.from_documents(\n",
    "            documents = self.splitText,\n",
    "            embedding = embeddings,\n",
    "        )\n",
    "        return db\n",
    "    \n",
    "    #提问并找到相关的文本块\n",
    "    def askAndFindFiles(self,question):\n",
    "        db = self.embeddingAndVectorDB()\n",
    "        #把问题交给LLM进行多角度的扩展\n",
    "        llm = ChatOpenAI(temperature=0)\n",
    "        retriever_from_llm = MultiQueryRetriever.from_llm(\n",
    "            retriever = db.as_retriever(),\n",
    "            llm = llm,\n",
    "        )\n",
    "        return retriever_from_llm.get_relevant_documents(question)\n",
    "\n",
    "chat_doc = ChatDoc()\n",
    "chat_doc.doc = \"example/fake.docx\"\n",
    "chat_doc.splitSentences()\n",
    "#设置下logging查看生成查询\n",
    "import logging\n",
    "logging.basicConfig(level=logging.INFO)\n",
    "logging.getLogger(\"langchain.retrievers.multi_query\").setLevel(logging.DEBUG)\n",
    "unique_doc = chat_doc.askAndFindFiles(\"公司名称是什么?\")\n",
    "print(unique_doc)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 9. 使用上下文压缩检索降低冗余信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:httpx:HTTP Request: POST https://ai-yyds.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
      "/Users/tomiezhang/.pyenv/versions/3.10.12/envs/langchains/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The class `langchain_community.llms.openai.OpenAI` was deprecated in langchain-community 0.0.10 and will be removed in 0.2.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAI`.\n",
      "  warn_deprecated(\n",
      "INFO:httpx:HTTP Request: POST https://ai-yyds.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
      "/Users/tomiezhang/.pyenv/versions/3.10.12/envs/langchains/lib/python3.11/site-packages/langchain/chains/llm.py:316: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n",
      "  warnings.warn(\n",
      "INFO:httpx:HTTP Request: POST https://ai-yyds.com/v1/completions \"HTTP/1.1 200 OK\"\n",
      "/Users/tomiezhang/.pyenv/versions/3.10.12/envs/langchains/lib/python3.11/site-packages/langchain/chains/llm.py:316: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n",
      "  warnings.warn(\n",
      "INFO:httpx:HTTP Request: POST https://ai-yyds.com/v1/completions \"HTTP/1.1 200 OK\"\n",
      "/Users/tomiezhang/.pyenv/versions/3.10.12/envs/langchains/lib/python3.11/site-packages/langchain/chains/llm.py:316: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n",
      "  warnings.warn(\n",
      "INFO:httpx:HTTP Request: POST https://ai-yyds.com/v1/completions \"HTTP/1.1 200 OK\"\n",
      "/Users/tomiezhang/.pyenv/versions/3.10.12/envs/langchains/lib/python3.11/site-packages/langchain/chains/llm.py:316: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n",
      "  warnings.warn(\n",
      "INFO:httpx:HTTP Request: POST https://ai-yyds.com/v1/completions \"HTTP/1.1 200 OK\"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Document(page_content='银行贷款：公司向多家银行贷款总额达人民币1亿元，部分贷款已逾期未还。 供应商欠款：因现金流紧张，公司拖欠供应商货款达人民币300万元。 员工工资及社保：由于资金链断裂，公司拖欠员工工资及社保费用共计人民币200万元。', metadata={'source': 'example/fake.docx'}), Document(page_content='银行贷款：公司向多家银行贷款总额达人民币1亿元，部分贷款已逾期未还。 供应商欠款：因现金流紧张，公司拖欠供应商货款达人民币300万元。 员工工资及社保：由于资金链断裂，公司拖欠员工工资及社保费用共计人民币200万元。', metadata={'source': 'example/fake.docx'}), Document(page_content='银行贷款：公司向多家银行贷款总额达人民币1亿元，部分贷款已逾期未还。 供应商欠款：因现金流紧张，公司拖欠供应商货款达人民币300万元。 员工工资及社保：由于资金链断裂，公司拖欠员工工资及社保费用共计人民币200万元。', metadata={'source': 'example/fake.docx'}), Document(page_content='四、债权债务情况\\n\\n宏图科技发展有限公司目前面临的债务问题严峻，具体情况如下：', metadata={'source': 'example/fake.docx'})]\n"
     ]
    }
   ],
   "source": [
    "#导入必须的包\n",
    "from langchain.document_loaders import UnstructuredExcelLoader,Docx2txtLoader,PyPDFLoader\n",
    "from langchain.text_splitter import  CharacterTextSplitter\n",
    "from langchain_openai import OpenAIEmbeddings\n",
    "from langchain.vectorstores import  Chroma\n",
    "#引入openai和多重向量检索\n",
    "#from langchain.chat_models import ChatOpenAI\n",
    "#from langchain.retrievers.multi_query import MultiQueryRetriever\n",
    "#引入上下文压缩相关包\n",
    "from langchain.llms import  OpenAI\n",
    "from langchain.retrievers import ContextualCompressionRetriever\n",
    "from langchain.retrievers.document_compressors import  LLMChainExtractor\n",
    "\n",
    "#定义chatdoc\n",
    "class ChatDoc():\n",
    "    def __init__(self):\n",
    "        self.doc = None\n",
    "        self.splitText = [] #分割后的文本\n",
    "\n",
    "    def getFile(self):\n",
    "        doc = self.doc\n",
    "        loaders = {\n",
    "            \"docx\":Docx2txtLoader,\n",
    "            \"pdf\":PyPDFLoader,\n",
    "            \"xlsx\":UnstructuredExcelLoader,\n",
    "        }\n",
    "        file_extension = doc.split(\".\")[-1]\n",
    "        loader_class = loaders.get(file_extension)\n",
    "        if loader_class:\n",
    "            try:\n",
    "                loader = loader_class(doc)\n",
    "                text = loader.load()\n",
    "                return text\n",
    "            except Exception as e: \n",
    "                print(f\"Error loading {file_extension} files:{e}\") \n",
    "        else:\n",
    "             print(f\"Unsupported file extension: {file_extension}\")\n",
    "             return  None \n",
    "\n",
    "    #处理文档的函数\n",
    "    def splitSentences(self):\n",
    "        full_text = self.getFile() #获取文档内容\n",
    "        if full_text != None:\n",
    "            #对文档进行分割\n",
    "            text_split = CharacterTextSplitter(\n",
    "                chunk_size=150,\n",
    "                chunk_overlap=20,\n",
    "            )\n",
    "            texts = text_split.split_documents(full_text)\n",
    "            self.splitText = texts\n",
    "    \n",
    "    #向量化与向量存储\n",
    "    def embeddingAndVectorDB(self):\n",
    "        embeddings = OpenAIEmbeddings(\n",
    "            model=\"text-embedding-3-small\"\n",
    "        )\n",
    "        db =Chroma.from_documents(\n",
    "            documents = self.splitText,\n",
    "            embedding = embeddings,\n",
    "        )\n",
    "        return db\n",
    "    \n",
    "    #提问并找到相关的文本块\n",
    "    def askAndFindFiles(self,question):\n",
    "        db = self.embeddingAndVectorDB()\n",
    "        retriever = db.as_retriever()\n",
    "        llm = OpenAI(temperature=0)\n",
    "        compressor = LLMChainExtractor.from_llm(\n",
    "            llm = llm,\n",
    "        )\n",
    "        compressor_retriever = ContextualCompressionRetriever(\n",
    "            base_retriever = retriever,\n",
    "            base_compressor = compressor,\n",
    "        )\n",
    "        return compressor_retriever.get_relevant_documents(query=question)\n",
    "\n",
    "chat_doc = ChatDoc()\n",
    "chat_doc.doc = \"example/fake.docx\"\n",
    "chat_doc.splitSentences()\n",
    "#设置下logging查看生成查询\n",
    "import logging\n",
    "logging.basicConfig(level=logging.INFO)\n",
    "logging.getLogger(\"langchain.retrievers.multi_query\").setLevel(logging.DEBUG)\n",
    "unique_doc = chat_doc.askAndFindFiles(\"这间公司的负债有多少？\")\n",
    "print(unique_doc)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 11. 在向量存储里使用最大边际相似性（MMR）和相似性打分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:httpx:HTTP Request: POST https://ai-yyds.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST https://ai-yyds.com/v1/embeddings \"HTTP/1.1 200 OK\"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Document(page_content='一、公司基本信息\\n\\n名称：宏图科技发展有限公司\\n\\n注册地址：江苏省南京市雨花台区软件大道101号\\n\\n成立日期：2011年5月16日\\n\\n法定代表人：李强\\n\\n注册资本：人民币5000万元\\n\\n员工人数：约200人\\n\\n联系电话：025-88888888\\n\\n电子邮箱：info@hongtutech.cn', metadata={'source': 'example/fake.docx'})]\n"
     ]
    }
   ],
   "source": [
    "#导入必须的包\n",
    "from langchain.document_loaders import UnstructuredExcelLoader,Docx2txtLoader,PyPDFLoader\n",
    "from langchain.text_splitter import  CharacterTextSplitter\n",
    "from langchain_openai import OpenAIEmbeddings\n",
    "from langchain.vectorstores import  Chroma\n",
    "\n",
    "\n",
    "#定义chatdoc\n",
    "class ChatDoc():\n",
    "    def __init__(self):\n",
    "        self.doc = None\n",
    "        self.splitText = [] #分割后的文本\n",
    "\n",
    "    def getFile(self):\n",
    "        doc = self.doc\n",
    "        loaders = {\n",
    "            \"docx\":Docx2txtLoader,\n",
    "            \"pdf\":PyPDFLoader,\n",
    "            \"xlsx\":UnstructuredExcelLoader,\n",
    "        }\n",
    "        file_extension = doc.split(\".\")[-1]\n",
    "        loader_class = loaders.get(file_extension)\n",
    "        if loader_class:\n",
    "            try:\n",
    "                loader = loader_class(doc)\n",
    "                text = loader.load()\n",
    "                return text\n",
    "            except Exception as e: \n",
    "                print(f\"Error loading {file_extension} files:{e}\") \n",
    "        else:\n",
    "             print(f\"Unsupported file extension: {file_extension}\")\n",
    "             return  None \n",
    "\n",
    "    #处理文档的函数\n",
    "    def splitSentences(self):\n",
    "        full_text = self.getFile() #获取文档内容\n",
    "        if full_text != None:\n",
    "            #对文档进行分割\n",
    "            text_split = CharacterTextSplitter(\n",
    "                chunk_size=150,\n",
    "                chunk_overlap=20,\n",
    "            )\n",
    "            texts = text_split.split_documents(full_text)\n",
    "            self.splitText = texts\n",
    "    \n",
    "    #向量化与向量存储\n",
    "    def embeddingAndVectorDB(self):\n",
    "        embeddings = OpenAIEmbeddings(\n",
    "            model=\"text-embedding-3-small\"\n",
    "        )\n",
    "        db =Chroma.from_documents(\n",
    "            documents = self.splitText,\n",
    "            embedding = embeddings,\n",
    "        )\n",
    "        return db\n",
    "    \n",
    "    #提问并找到相关的文本块\n",
    "    def askAndFindFiles(self,question):\n",
    "        db = self.embeddingAndVectorDB()\n",
    "        #retriever = db.as_retriever(search_type=\"mmr\")\n",
    "        retriever = db.as_retriever(search_type=\"similarity_score_threshold\",search_kwargs={\"score_threshold\":.1,\"k\":1})\n",
    "        return retriever.get_relevant_documents(query=question)\n",
    "        \n",
    "\n",
    "chat_doc = ChatDoc()\n",
    "chat_doc.doc = \"example/fake.docx\"\n",
    "chat_doc.splitSentences()\n",
    "#设置下logging查看生成查询\n",
    "import logging\n",
    "logging.basicConfig(level=logging.INFO)\n",
    "logging.getLogger(\"langchain.retrievers.multi_query\").setLevel(logging.DEBUG)\n",
    "unique_doc = chat_doc.askAndFindFiles(\"这家公司的地址在哪里?\")\n",
    "print(unique_doc)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
